Analysis parameters:
rsfMRI data Parameters:
rsfMRI: R1S1 mr_pcorr.txt file
Z norm before load: False
upsampling: UP
ML Parameters: - Z norm in glmnet: True
W: normalized maxLL_difference (min max
normalization)
Hyperparameter tuning: nfolds = length(Y) best lambda
= median(nested CV lambdas)
Model evaluation: averaged accuracy/auc_roc score of nested CV
Nested CV Parameters
Betas: |mean beta| > 0.05
Round = 200, CV folds = 20
Train/Test Split: 1:3, Seed=0
Lambda: 1se median of min lambdas from 200
rounds
Load intermediate data after running
analysis_lasso_re_preapre_data.Rmd
# Restore the prepared objects (X, Y, W inputs, I, order, dvs.up, ...) from
# the cache written by the data-preparation notebook.
# NOTE(review): the path spells "worksapce" — keep in sync with the file
# actually written by the prepare step.
load(file = "../data/__cache__/worksapce_prepare_data.RData")
We can now define the lasso model. We will use the elastic net approach with \(\alpha = 1\) to generate a pure lasso model (in glmnet, \(\alpha = 1\) is the lasso penalty and \(\alpha = 0\) is ridge). The model will use a binomial (i.e., logistic) regression and will measure the cross-validation error as class misclassification.
To analyze the data, we will use Lasso, a statistical learning system based on penalized regression.
Most of the entries in our \(Y\) vector are coded as “0” (i.e., most participants use the memory strategy), which creates an imbalance. We are going to create an appropriate set of weights so that the two classes are balanced.
New: Instead of using class proportions as weights, we apply the maxLL_difference as the weight, increasing the importance of individuals whose data show a larger maxLL_difference.
# based on proportion of two labels
# Class-balancing weights: "0" cases receive mean(Y) and "1" cases receive
# 1 - mean(Y), so each class contributes equally in total.
# NOTE(review): W is overwritten a few lines below by the maxLL-difference
# weights, so these proportion-based values are never used by the model.
W <- Y
W[W == 0] <- mean(Y)
W[W == 1] <- (1-mean(Y))
# Linearly rescale a numeric vector onto [0, 1].
# Like the original, a constant input (max == min) yields NaN, and any NA
# in x propagates through min()/max().
min_max_norm <- function(x) {
  rng <- range(x)
  (x - rng[1]) / (rng[2] - rng[1])
}
# obtain weights based on the absolute value of maxLL diff between model1 and model2
# Overwrites the proportion-based W above: observations with a larger
# |maxLL difference| (stronger evidence for one strategy model) get more
# weight in the glmnet fits.
W <- min_max_norm(abs(dvs.up$LL.diff))
To choose the optimal value of \(\lambda\) in Lasso, we will examine the cross-validation error during a LOO cross-validation.
Alternatively, if we split the dataset into train/test, can we still predict effectively? It turns out that the testing accuracy for unseen data is poor (0.63).
It’s better to pursue a consensus-feature approach to get the betas.
# Toggle between a single train/test split (TRUE) and a full-data fit with
# LOO-CV (FALSE). The split indices are computed either way because later
# evaluation chunks reference `train`/`test`.
USE_TRAIN_TEST_SPLIT <- FALSE
set.seed(0)
n = length(Y)
# 3/4 train, 1/4 test. floor() makes the integer sample size explicit
# instead of relying on implicit truncation of 3*n/4 when n is not a
# multiple of 4.
train = sample(1:n, floor(3 * n / 4)) # by default replace=FALSE in sample()
test = (1:n)[-train]
# Fit the lasso (alpha = 1) logistic model. Two modes:
#  - TRUE: tune lambda by CV on the training split only, then refit on all
#    data at the tuned lambda.min.
#  - FALSE (current setting): fit the full coefficient path on all data and
#    tune lambda with leave-one-out CV (nfolds = length(Y)).
if (USE_TRAIN_TEST_SPLIT) {
# find optimal lambda using training dataset
fit.cv = cv.glmnet(X[train,], Y[train],
alpha=1,
weights = W[train],
family = "binomial",
type.measure = "class",
nfolds=length(train),
keep=T,
standardize=T)
# Refit on ALL observations at the tuned lambda.
# NOTE(review): type.measure and grouped are cv.glmnet() arguments, not
# glmnet() arguments — confirm they are not silently ignored here.
fit = glmnet(X, Y,
alpha = 1,
lambda = fit.cv$lambda.min,
family = "binomial",
weights = W,
type.measure = "class",
standardize=T,
grouped = T)
} else {
# Full-data coefficient path over glmnet's automatic lambda sequence.
fit <- glmnet(y = Y,
x = X,
alpha=1,
#lambda = fit.cv$lambda.min,
weights = W,
family = "binomial",
type.measure = "class",
standardize = T
)
# LOO CV to find optimal lambda
# grouped = F is required for leave-one-out CV (one observation per fold).
fit.cv <- cv.glmnet(y = Y,
x = X,
alpha=1,
family = "binomial",
weights = W,
type.measure = "class",
standardize=T,
nfolds=length(Y),
grouped = F,
keep = T
)
}
Now, let’s look at the cross-validation error profile.
# CV misclassification error vs. log(lambda), with lambda.min and
# lambda.1se marked by glmnet's default plot method.
plot(fit.cv)
The profile has the characteristic U-shape or increasing curve, with
more error as \(\lambda\) increases.
As recommended by Tibshirani, we will select the “lambda 1SE”
value, which is the largest \(\lambda\) value that does not differ by
more than 1 SE from the \(\lambda\)
value that gives us the minimum cross validation error. This guarantees
the maximum generalizability.
We will select the “lambda min” value, leaving approximately 68 non-zero connections.
We can also visualize the profile of the beta weights of each connection for different values of \(\lambda\).
# Coefficient paths, with a dashed vertical line at the L1 norm of the
# betas selected at lambda.min.
# NOTE(review): which(fit$lambda == fit.cv$lambda.min) relies on the two
# objects sharing this exact lambda value; if fit and fit.cv were built with
# different sequences the match can be empty and L1norm silently becomes 0.
plot(fit, sub="Beta Values for Connectivity")
L1norm <- sum(abs(fit$beta[,which(fit$lambda==fit.cv$lambda.min)]))
abline(v=L1norm, lwd=2, lty=2)
And now, plot prettier version
# Prettier CV-error profile: mean misclassification error with a +/- 1 SD
# ribbon across the lambda path, and a vertical line at the lambda that
# minimizes the cross-validation error.
lasso_df <- tibble(lambda = fit.cv$lambda,
                   error = fit.cv$cvm,
                   sd = fit.cv$cvsd)
min_error_lambda <- lasso_df$lambda[lasso_df$error == min(lasso_df$error)]
ggplot(lasso_df, aes(x = lambda, y = error)) +
  geom_line(aes(col = error), lwd = 2) +
  scale_color_viridis("Error", option = "plasma") +
  geom_ribbon(aes(ymin = error - sd, ymax = error + sd),
              alpha = 0.2, fill = "blue") +
  xlab(expression(lambda)) +
  ylab("Cross-Validation Error") +
  ggtitle(expression(paste(bold("Cross Validation Error Across "), lambda))) +
  geom_vline(xintercept = min_error_lambda) +
  theme_pander() +
  theme(legend.position = "right")
The min \(\lambda_{min}\) is 0.0204755, The 1se min \(\lambda_{min}\) is 0.0374853
# Scatter of predicted probability vs. observed label, jittered vertically
# and colored by whether the 0.5-thresholded prediction matches the label.
#   Y              observed binary labels (0/1)
#   Yp             predicted probabilities (or labels; see callers)
#   weighted_score if > 0, displayed as the accuracy instead of the raw
#                  fraction of correct classifications computed here
#   title          suffix appended to the plot title
# Returns the ggplot object.
plot_prediction <- function(Y, Yp, weighted_score, title) {
wcomparison <- tibble(Observed = Y,
Predicted = Yp,
DiscretePredicted = ifelse(Yp < 0.5, 0, 1))
wcomparison %<>% mutate(Accuracy = ifelse(DiscretePredicted == Observed,
"Correct",
"Misclassified")) %>% drop_na()
#rval <- floor(100*cor(Y, Yp))/100
# Raw accuracy (%) after dropping NA rows.
aval <- round(100*nrow(dplyr::filter(wcomparison, Accuracy %in% "Correct")) / nrow(wcomparison),2)
# update weighted score
# A caller-supplied positive score overrides the raw accuracy in the label.
if (weighted_score > 0) {
accuracy_score <- weighted_score
} else {
accuracy_score <- aval
}
p <- ggplot(wcomparison, aes(x=Predicted, y=Observed,
col=Accuracy)) +
geom_point(size=4, alpha=0.6,
position= position_jitter(height = 0.02)) +
geom_abline(intercept = 0, slope = 1,
col="red",
linetype="dashed") +
scale_color_d3() +
theme_pander() +
# legend.position is set again to "bottom" at the end; that later call wins.
theme(legend.position = "right") +
guides(col=guide_legend("Classification")) +
coord_fixed(xlim=c(0, 1), ylim=c(0, 1)) +
annotate("text", x=0.3, y=0.7,
label=paste("Accuracy (",
length(Y),
") = ",
accuracy_score,
"%",
sep="")) +
ylab("Observed Strategy") +
xlab("Predicted Strategy") +
ggtitle(paste("Predicted vs. Observation",title)) +
theme(legend.position = "bottom")
# NOTE(review): the ggMarginal() result is discarded — the function returns
# the plain plot p, so the marginal densities never appear. If marginals
# are wanted, return the ggMarginal object instead.
ggMarginal(p,
fill="grey",
alpha=0.75,
type="density", #bins=13,
col="darkgrey",
margins = "both")
return (p)
}
# ROC curve (via pROC::roc) of predictions against observed labels.
#   Y              observed binary labels (0/1)
#   Yp             predicted probabilities
#   weighted_score if > 0, displayed as the AUC instead of the computed one
#   title          text inserted into the plot title
# NOTE(review): the ROC is computed on the 0.5-dichotomized predictions
# (ROCPrediction), not on the probabilities, so the curve has a single
# operating point; pass wcomparison$Predicted to roc() for a full
# probability-based ROC.
plot_roc <- function(Y, Yp, weighted_score, title) {
wcomparison <- tibble(Observed = Y,
Predicted = Yp,
DiscretePredicted = ifelse(Yp < 0.5, 0, 1))
wcomparison %<>% mutate(Accuracy = ifelse(DiscretePredicted == Observed,
"Correct",
"Misclassified")) %>% drop_na()
wcomparison %<>% mutate(ROCPrediction = if_else(Predicted < 0.5, 0, 1))
rocobj <- roc(wcomparison$Observed, wcomparison$ROCPrediction)
# A caller-supplied positive score overrides the computed AUC in the title.
if (weighted_score > 0) {
roc_score <- round(weighted_score, 4)
} else {
roc_score <- round(rocobj$auc[[1]], 4)
}
g <- ggroc(rocobj, col="red") +
geom_point(aes(y=rocobj$sensitivities, x=rocobj$specificities), col="red", size=4, alpha=.5) +
ggtitle(paste("AUC ROC Curve", title, roc_score)) +
xlab("Specificity (FPR)") + ylab("Sensitivity (TPR)") +
geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1), color="grey", linetype="dashed") +
theme_pander()
g
}
# ROC curve built by sweeping the classification threshold from 0 to 1 in
# steps of 0.01 and computing, at each threshold, the true-negative rate
# (accuracy among Observed == 0) and true-positive rate (accuracy among
# Observed == 1).
#   Y     observed binary labels (0/1)
#   Yp    predicted probabilities
#   title unused in the plot body (kept for interface compatibility)
# Returns the ggplot object.
plot_roc_slide <- function(Y, Yp, title) {
  wcomparison <- tibble(Observed = Y,
                        Predicted = Yp,
                        DiscretePredicted = ifelse(Yp < 0.5, 0, 1))
  wcomparison %<>% mutate(Accuracy = ifelse(DiscretePredicted == Observed,
                                            "Correct",
                                            "Misclassified")) %>% drop_na()
  wcomparison %<>% mutate(ROCPrediction = if_else(Predicted < 0.5, 0, 1))
  # One tibble per threshold, combined once at the end. This replaces the
  # original rbind()-in-a-loop, which grows the data frame quadratically.
  curve <- bind_rows(lapply(seq(0, 1, 0.01), function(threshold) {
    subthreshold <- wcomparison %>%
      # clamp predictions into (0, 1]; 1e-204 keeps zeros strictly positive
      mutate(Prediction = ifelse(Predicted > 1, 1, Predicted)) %>%
      mutate(Prediction = ifelse(Prediction <= 0, 1e-204, Prediction)) %>%
      mutate(Prediction = ifelse(Prediction <= threshold, 0, 1)) %>%
      mutate(Accuracy = ifelse(Prediction == Observed, 1, 0)) %>%
      group_by(Observed) %>%
      summarise(Accuracy = mean(Accuracy))
    tnr <- subthreshold %>%
      filter(Observed == 0) %>%
      dplyr::select(Accuracy) %>%
      as.numeric()
    tpr <- subthreshold %>%
      filter(Observed == 1) %>%
      dplyr::select(Accuracy) %>%
      as.numeric()
    tibble(Threshold = threshold,
           TNR = tnr,
           TPR = tpr)
  }))
  s <- ggplot(arrange(curve, TPR), aes(x=TNR, y=TPR)) +
    geom_point(size=2, col="red", alpha=0.5) +
    geom_line(col="red") +
    ylab("Sensitivity (True Positive Rate)") +
    xlab("Specificity (True Negative Rate)") +
    scale_x_reverse() +
    # ylim(0, 1) +
    # xlim(1, 0) +
    ggtitle("ROC Curve for Different Thresholds") +
    geom_abline(slope=1, intercept = 1, col="grey", linetype = "dashed") +
    theme_pander()
  s
}
Use assess.glmnet() and confusion.glmnet() to evaluate model performance
# evaluate training accuracy
# Accuracy = 1 - misclassification error from assess.glmnet() at
# lambda.min, computed separately on the earlier train/test indices.
# NOTE(review): in the USE_TRAIN_TEST_SPLIT = FALSE path, fit.cv was tuned
# on ALL observations, so the "testing" numbers here are not truly
# held-out.
fit.cv.accuracy <- 1-assess.glmnet(fit.cv,
newx = X[train, ], newy = Y[train],weights = W[train],
s="lambda.min",
family = "binomial")$class %>% as.vector()
fit.cv.accuracy_testing <- 1-assess.glmnet(fit.cv,
newx = X[test, ], newy = Y[test],weights = W[test],
s="lambda.min",
family = "binomial")$class %>% as.vector()
# AUC on the same train/test subsets.
fit.cv.auc <- assess.glmnet(fit.cv,
newx = X[train, ], newy = Y[train],weights = W[train],
s="lambda.min",
family = "binomial")$auc %>% as.vector()
fit.cv.auc_testing <- assess.glmnet(fit.cv,
newx = X[test, ], newy = Y[test],weights = W[test],
s="lambda.min",
family = "binomial")$auc %>% as.vector()
# training data prediction probabilities
# Hard class labels (coerced to numeric 0/1) and response-scale
# probabilities for the training rows at lambda.min.
fit.cv.pred <- predict(fit.cv, newx = X[train, ],
weights = W[train],
s="lambda.min",
type="class",
family = "binomial")%>% as.numeric()
fit.cv.predprob <- predict(fit.cv,
newx = X[train, ],
weights = W[train],
s="lambda.min",
type="response", family = "binomial")%>% as.numeric()
Calculate training Accuracy score (0.982078) and AUC score (0.9945356)
Predicted vs. Observations
# Full-data accuracy from assess.glmnet(), shown as the score on the
# training prediction plot.
# NOTE(review): the arguments here are positional (fit.cv, X, Y, W,
# "binomial") — confirm that the 4th and 5th positions of assess.glmnet()
# really are weights and family.
score <- round(1-as.vector(assess.glmnet(fit.cv, X, Y, W, "binomial")$class), 4)
plot_prediction(Y[train], fit.cv.predprob, score, "(Training)")
ROC Curve
# Full-data AUC, displayed on the training-set ROC plot.
score <- round(as.vector(assess.glmnet(fit.cv, X, Y, W, "binomial")$auc), 4)
plot_roc(Y[train], fit.cv.predprob, score, "Training")
ROC Curve By Sliding Threshold
plot_roc_slide(Y[train], fit.cv.predprob, "Training")
Let’s have a better look at the relevant connections that survive the Lasso penalty at the value of \(\lambda_{min}\) (the \(\lambda_{1SE}\) alternative is commented out in the code below). We start by saving these connections in a tibble, and saving the data to a file for later use.
#betas <- fit$beta[, which(fit$lambda==fit.cv$lambda.1se)]
# Betas at lambda.min from the full-data fit, joined back onto the
# connection metadata; only connections with non-zero betas are kept.
betas <- fit$beta[, which(fit$lambda==fit.cv$lambda.min)]
conn_betas <- as_tibble(data.frame(index=I$index, Beta=betas))
connectome <- order %>%
filter(index %in% I$index) %>%
inner_join(conn_betas) %>%
dplyr::select(-censor2) %>%
filter(Beta != 0) %>%
# reformat connectome
separate(connection, c("connection1", "connection2"))%>%
# Split the "a-b" network code on "-"; network IDs of "-1" (Uncertain)
# break this simple split, hence the patch-ups below.
separate(network, sep = "-", c("network1", "network2"), remove = F) %>%
#filter(str_detect(network, pattern = "-1-")) %>%
mutate(network1 = ifelse(str_detect(network, pattern = "-1-"), -1, network1)) %>%
mutate(connection_type = ifelse(network1==network2, "Within", "Between")) %>%
arrange(index)
# HARD CODE
# Manually repair network2 for connections whose first network is "-1",
# since separate() mis-assigns the pieces for negative IDs.
connectome[connectome$network=="-1-5","network2"] <- "5"
connectome[connectome$network=="-1-7","network2"] <- "7"
connectome[connectome$network=="-1--1","network2"] <- "-1"
connectome[connectome$network=="-1-11","network2"] <- "11"
connectome[connectome$network=="-1-12","network2"] <- "12"
connectome[connectome$network=="-1-13","network2"] <- "13"
In sum, connectome has 70 non-zero connections, 24 positive beta, and 46 negative betas. We will use these betas for later brain connectivity analysis.
Finally, we can visualize the table of connections
# Render the surviving connections as a striped, hoverable HTML table.
connectome_table <- xtable(connectome)
kable(connectome_table, digits = 5) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
| index | network | network1 | network2 | network_names | connection1 | connection2 | censor | Beta | connection_type |
|---|---|---|---|---|---|---|---|---|---|
| 6092 | 1-1 | 1 | 1 | Sensory/somatomotor Hand-Sensory/somatomotor Hand | 20 | 24 | TRUE | -1.02642 | Within |
| 6620 | 1-1 | 1 | 1 | Sensory/somatomotor Hand-Sensory/somatomotor Hand | 20 | 26 | TRUE | -0.26556 | Within |
| 7681 | 1-1 | 1 | 1 | Sensory/somatomotor Hand-Sensory/somatomotor Hand | 25 | 30 | TRUE | -0.95704 | Within |
| 9261 | 1-1 | 1 | 1 | Sensory/somatomotor Hand-Sensory/somatomotor Hand | 21 | 36 | TRUE | -0.41736 | Within |
| 9264 | 1-1 | 1 | 1 | Sensory/somatomotor Hand-Sensory/somatomotor Hand | 24 | 36 | TRUE | -1.24174 | Within |
| 9266 | 1-1 | 1 | 1 | Sensory/somatomotor Hand-Sensory/somatomotor Hand | 26 | 36 | TRUE | 0.57663 | Within |
| 10049 | 1-1 | 1 | 1 | Sensory/somatomotor Hand-Sensory/somatomotor Hand | 17 | 39 | TRUE | 0.96682 | Within |
| 10063 | 1-1 | 1 | 1 | Sensory/somatomotor Hand-Sensory/somatomotor Hand | 31 | 39 | TRUE | 0.05859 | Within |
| 10586 | 1-1 | 1 | 1 | Sensory/somatomotor Hand-Sensory/somatomotor Hand | 26 | 41 | TRUE | -2.21316 | Within |
| 12434 | 1-3 | 1 | 3 | Sensory/somatomotor Hand-Cingulo-opercular Task Control | 26 | 48 | TRUE | 3.50087 | Between |
| 16916 | 1-4 | 1 | 4 | Sensory/somatomotor Hand-Auditory | 20 | 65 | TRUE | -1.96573 | Between |
| 17736 | 3-4 | 3 | 4 | Cingulo-opercular Task Control-Auditory | 48 | 68 | TRUE | -1.27235 | Between |
| 20077 | 1-5 | 1 | 5 | Sensory/somatomotor Hand-Default mode | 13 | 77 | TRUE | -1.40499 | Between |
| 24368 | 5-5 | 5 | 5 | Default mode-Default mode | 80 | 93 | TRUE | 0.44822 | Within |
| 24907 | 5-5 | 5 | 5 | Default mode-Default mode | 91 | 95 | TRUE | -4.35552 | Within |
| 25167 | 5-5 | 5 | 5 | Default mode-Default mode | 87 | 96 | TRUE | -4.90869 | Within |
| 25424 | 5-5 | 5 | 5 | Default mode-Default mode | 80 | 97 | TRUE | -3.28512 | Within |
| 26222 | 5-5 | 5 | 5 | Default mode-Default mode | 86 | 100 | TRUE | -1.66302 | Within |
| 26497 | 5-5 | 5 | 5 | Default mode-Default mode | 97 | 101 | TRUE | -0.21799 | Within |
| 27030 | 5-5 | 5 | 5 | Default mode-Default mode | 102 | 103 | TRUE | 0.04487 | Within |
| 27823 | 5-5 | 5 | 5 | Default mode-Default mode | 103 | 106 | TRUE | -0.04396 | Within |
| 28059 | 5-5 | 5 | 5 | Default mode-Default mode | 75 | 107 | TRUE | -0.70367 | Within |
| 29147 | 5-5 | 5 | 5 | Default mode-Default mode | 107 | 111 | TRUE | 1.43087 | Within |
| 30208 | 5-5 | 5 | 5 | Default mode-Default mode | 112 | 115 | TRUE | 0.60591 | Within |
| 31499 | 5-5 | 5 | 5 | Default mode-Default mode | 83 | 120 | TRUE | -2.34745 | Within |
| 31503 | 5-5 | 5 | 5 | Default mode-Default mode | 87 | 120 | TRUE | 2.42777 | Within |
| 34438 | 5-5 | 5 | 5 | Default mode-Default mode | 118 | 131 | TRUE | 4.58621 | Within |
| 35205 | 5-6 | 5 | 6 | Default mode-Memory retrieval? | 93 | 134 | TRUE | -6.30492 | Between |
| 35510 | 6-6 | 6 | 6 | Memory retrieval?-Memory retrieval? | 134 | 135 | TRUE | -0.63197 | Within |
| 36698 | -1--1 | -1 | -1 | Uncertain-Uncertain | 2 | 140 | TRUE | 2.41711 | Between |
| 40807 | 7-7 | 7 | 7 | Visual-Visual | 151 | 155 | TRUE | -3.78352 | Within |
| 41075 | 7-7 | 7 | 7 | Visual-Visual | 155 | 156 | TRUE | -0.24790 | Within |
| 42053 | 5-7 | 5 | 7 | Default mode-Visual | 77 | 160 | TRUE | -0.11056 | Between |
| 42927 | 7-7 | 7 | 7 | Visual-Visual | 159 | 163 | TRUE | -7.89760 | Within |
| 43423 | 5-7 | 5 | 7 | Default mode-Visual | 127 | 165 | TRUE | 5.04510 | Between |
| 44756 | -1-7 | -1 | 7 | Uncertain-Visual | 140 | 170 | TRUE | -1.37508 | Between |
| 45044 | 7-7 | 7 | 7 | Visual-Visual | 164 | 171 | TRUE | -0.64188 | Within |
| 45309 | 7-7 | 7 | 7 | Visual-Visual | 165 | 172 | TRUE | 3.80543 | Within |
| 45566 | 7-7 | 7 | 7 | Visual-Visual | 158 | 173 | TRUE | 5.10282 | Within |
| 47111 | 5-8 | 5 | 8 | Default mode-Fronto-parietal Task Control | 119 | 179 | TRUE | 1.15366 | Between |
| 47145 | 7-8 | 7 | 8 | Visual-Fronto-parietal Task Control | 153 | 179 | TRUE | 1.13669 | Between |
| 48912 | 4-8 | 4 | 8 | Auditory-Fronto-parietal Task Control | 72 | 186 | TRUE | -1.24596 | Between |
| 49015 | 8-8 | 8 | 8 | Fronto-parietal Task Control-Fronto-parietal Task Control | 175 | 186 | TRUE | -0.17310 | Within |
| 49555 | 8-8 | 8 | 8 | Fronto-parietal Task Control-Fronto-parietal Task Control | 187 | 188 | TRUE | -0.42866 | Within |
| 51039 | 5-8 | 5 | 8 | Default mode-Fronto-parietal Task Control | 87 | 194 | TRUE | -3.08212 | Between |
| 51142 | 8-8 | 8 | 8 | Fronto-parietal Task Control-Fronto-parietal Task Control | 190 | 194 | TRUE | -2.89211 | Within |
| 51144 | 8-8 | 8 | 8 | Fronto-parietal Task Control-Fronto-parietal Task Control | 192 | 194 | TRUE | 0.10565 | Within |
| 51350 | 6-8 | 6 | 8 | Memory retrieval?-Fronto-parietal Task Control | 134 | 195 | TRUE | -4.01126 | Between |
| 52987 | 8-8 | 8 | 8 | Fronto-parietal Task Control-Fronto-parietal Task Control | 187 | 201 | TRUE | 4.30814 | Within |
| 53356 | 1-9 | 1 | 9 | Sensory/somatomotor Hand-Salience | 28 | 203 | TRUE | 4.73972 | Between |
| 54042 | 8-9 | 8 | 9 | Fronto-parietal Task Control-Salience | 186 | 205 | TRUE | -6.39182 | Between |
| 55649 | 9-9 | 9 | 9 | Salience-Salience | 209 | 211 | TRUE | -2.10899 | Within |
| 56170 | 8-9 | 8 | 9 | Fronto-parietal Task Control-Salience | 202 | 213 | TRUE | 4.02334 | Between |
| 57502 | 9-9 | 9 | 9 | Salience-Salience | 214 | 218 | TRUE | 0.57233 | Within |
| 57766 | 9-9 | 9 | 9 | Salience-Salience | 214 | 219 | TRUE | -0.03052 | Within |
| 59359 | 10-10 | 10 | 10 | Subcortical-Subcortical | 223 | 225 | TRUE | -7.29251 | Within |
| 60155 | 10-10 | 10 | 10 | Subcortical-Subcortical | 227 | 228 | TRUE | -5.03139 | Within |
| 60235 | 2-10 | 2 | 10 | Sensory/somatomotor Mouth-Subcortical | 43 | 229 | TRUE | -5.85398 | Between |
| 61745 | 10-10 | 10 | 10 | Subcortical-Subcortical | 233 | 234 | TRUE | -6.12406 | Within |
| 62372 | 4-11 | 4 | 11 | Auditory-Ventral attention | 68 | 237 | TRUE | -0.93681 | Between |
| 62630 | 4-11 | 4 | 11 | Auditory-Ventral attention | 62 | 238 | TRUE | -2.80812 | Between |
| 63462 | 5-11 | 5 | 11 | Default mode-Ventral attention | 102 | 241 | TRUE | -0.37644 | Between |
| 63860 | 11-11 | 11 | 11 | Ventral attention-Ventral attention | 236 | 242 | TRUE | -3.73544 | Within |
| 64923 | 13-13 | 13 | 13 | Cerebellar-Cerebellar | 243 | 246 | TRUE | 0.62941 | Within |
| 66013 | 1-12 | 1 | 12 | Sensory/somatomotor Hand-Dorsal attention | 13 | 251 | TRUE | -0.28276 | Between |
| 68104 | 12-12 | 12 | 12 | Dorsal attention-Dorsal attention | 256 | 258 | TRUE | -1.02315 | Within |
| 68570 | 8-12 | 8 | 12 | Fronto-parietal Task Control-Dorsal attention | 194 | 260 | TRUE | 2.28855 | Between |
| 68632 | 12-12 | 12 | 12 | Dorsal attention-Dorsal attention | 256 | 260 | TRUE | -0.62532 | Within |
| 68818 | 8-12 | 8 | 12 | Fronto-parietal Task Control-Dorsal attention | 178 | 261 | TRUE | -1.77074 | Between |
| 69481 | 3-12 | 3 | 12 | Cingulo-opercular Task Control-Dorsal attention | 49 | 264 | TRUE | 4.96137 | Between |
And now, let’s visualize the beta weights of the connections: num connections=70
# Lollipop chart of the non-zero lasso betas: one dot per connection,
# colored by sign, faceted into within- vs. between-network connections.
connectome %>%
mutate(beta_sign = ifelse(Beta >0, "+", "-")) %>%
ggdotchart(x = "network_names", y = "Beta",
color = "beta_sign", # Color by groups
palette = c("steelblue", "tomato"), # Custom color palette
rotate = TRUE,
facet.by = "connection_type",
sort.by.groups = F,
sort.val = "desc", # Sort the value in descending order
sorting = "descending", # Sort value in descending order
add = "segments", # Add segments from y = 0 to dots
add.params = list(color = "lightgray", size = 2), # Change segment color and size
group = "connection_type", # Order by groups
dot.size = 3, # Large dot size
#label = round(connectome$Beta,2), # Add mpg values as dot labels
#font.label = list(color = "white", size = 9,
# vjust = 0.5), # Adjust label parameters
#group = "cyl",
#y.text.col = TRUE,
title = paste("Lasso Connection Weights:", dim(connectome)[[1]]),
ggtheme = theme_pander()) +
geom_hline(yintercept = 0, linetype = 2, color = "black")
Lasso vs. Power
# Compare the network composition of the lasso-selected ROIs against the
# full Power et al. (2011) atlas, restricted to the networks of interest.
subsetPower <- filter(power2011,
                      NetworkName %in% NOI)
# Per-network proportion of ROIs in the atlas baseline.
noi_stats <- subsetPower %>%
  group_by(NetworkName, Color) %>%
  summarise(N=length(Color)/dim(subsetPower)[1]) %>%
  add_column(Source="Power")
# ROIs appearing in at least one surviving lasso connection.
lROIs <- unique(c(connectome$connection1, connectome$connection2))
rois <- power2011[lROIs,]
roi_stats <- rois %>%
  group_by(NetworkName, Color, .drop = F) %>%
  summarise(N=length(Color)/length(lROIs)) %>%
  add_column(Source="Lasso")
# Re-align to the atlas networks so networks with no selected ROI appear
# with N = 0. BUG FIX: the join key must be given as `by =`; the original
# passed `on =`, which dplyr's right_join() does not accept as a key and
# which only worked because the natural join happened to use the same two
# common columns.
roi_stats <- roi_stats %>%
  right_join(noi_stats %>% dplyr::select(NetworkName, Color),
             by = c("NetworkName", "Color")) %>%
  mutate(N = ifelse(is.na(N), 0, N), Source="Lasso") %>%
  arrange(NetworkName)
total_stats <- rbind(noi_stats, roi_stats)
# Pie charts of network composition: full Power atlas vs. lasso-selected
# ROIs, with percentage labels on each slice.
ggplot(total_stats, aes(x="", y=N, fill=NetworkName)) +
  geom_bar(stat = "identity", col="white", width=1) +
  facet_grid(~Source, labeller = label_both) +
  coord_polar("y", start=0) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 2L)) +
  scale_fill_manual(values = unique(power2011$Color)) +
  #scale_fill_viridis(discrete = T) +
  #scale_fill_ucscgb() +
  ylab("") +
  xlab("") +
  # typo fixed in the displayed title: "Distriution" -> "Distribution"
  ggtitle("Distribution of ROI") +
  geom_text_repel(aes(label=percent(N, .1)), col="black",
                  position=position_stack(vjust=.01), size=3)+
  theme_pander() +
  guides(fill=guide_legend(ncol=2)) +
  theme(legend.position = "bottom")
#ggbarplot(total_stats, x="NetworkName", y="N", facet.by = "Source", fill = "NetworkName", color = "white",
# merge = T, label = F,
# ) +
# coord_polar("y", start=0)
# Goodness-of-fit test: do the lasso-selected ROI counts follow the atlas
# network proportions? (N * length(lROIs) converts proportions back to raw
# counts.)
chisq.test(roi_stats$N*length(lROIs), p = noi_stats$N)
##
## Chi-squared test for given probabilities
##
## data: roi_stats$N * length(lROIs)
## X-squared = 13.689, df = 13, p-value = 0.3961
Lasso vs. Power:
Between vs. Within
# Split a "from-to" network code at "-" and return the first / second piece.
# NOTE(review): strsplit("-1-5", "-") yields c("", "1", "5"), so codes for
# network -1 (Uncertain) return "" / "1" here, and network NAMES containing
# hyphens (e.g. "Cingulo-opercular Task Control-...") split at the wrong
# hyphen. Within/Between labels derived from such strings should be
# double-checked before interpreting the chi-square below.
net_from <- function(s) {as.character(strsplit(s, "-")[[1]][1])}
net_to <- function(s) {as.character(strsplit(s, "-")[[1]][2])}
# Vectorized wrappers for use on data-frame columns.
vnet_from <- Vectorize(net_from)
vnet_to <- Vectorize(net_to)
# "Within" when both halves of the code match, otherwise "Between".
connectivity <- function(s) {
if (net_from(s) == net_to(s)) {
"Within"
} else {
"Between"
}
}
vconnectivity <- Vectorize(connectivity)
# Within/Between composition of all censored connections among the
# connections of interest (COI) — the Power-atlas baseline.
coi <- order %>%
filter(censor == TRUE) %>%
filter(network_names %in% COI)
coi$from <- vnet_from(coi$network_names)
coi$to <- vnet_to(coi$network_names)
coi$connection_type <- vconnectivity(coi$network_names)
coi_stats <- coi %>%
group_by(connection_type) %>%
summarise(N=length(index), P=length(index)/dim(coi)[1]) %>%
add_column(Source = "Power et al. (2011)")
# NOTE(review): this overwrites the connection_type computed earlier from
# network1/network2, now deriving it from network_names via strsplit; the
# two definitions can disagree for hyphenated network names.
connectome$connection_type <- vconnectivity(connectome$network_names)
connectome_stats <- connectome %>%
group_by(connection_type) %>%
summarise(N=length(index), P=length(index)/dim(connectome)[1]) %>%
add_column(Source = "Lasso Analysis")
connect_stats <- rbind(connectome_stats, coi_stats)
# Pie charts of Within- vs. Between-network connection proportions:
# lasso selection vs. the Power et al. (2011) baseline.
ggplot(connect_stats, aes(x="", y=P, fill=connection_type)) +
  geom_bar(stat = "identity", col="grey", width=1) +
  facet_grid(~Source, labeller = label_both) +
  coord_polar("y", start=0) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 2L)) +
  scale_fill_jama() +
  scale_color_jama() +
  ylab("") +
  xlab("") +
  # typo fixed in the displayed title: "Distribuction" -> "Distribution"
  ggtitle("Distribution of Connectivity\n(Within/Between Networks)") +
  geom_text_repel(aes(label=percent(P, .1)), col="white",
                  position=position_stack(vjust=1), size=3)+
  theme_pander() +
  theme(legend.position = "bottom")
# Goodness-of-fit: do the lasso Within/Between counts match the baseline
# Within/Between proportions?
chisq.test(connectome_stats$N, p=coi_stats$P)
##
## Chi-squared test for given probabilities
##
## data: connectome_stats$N
## X-squared = 191.64, df = 1, p-value < 2.2e-16
In nested CV, lambda is not the same for each subject
Cher’s method of nested cross-validation
# Nested CV: 200 rounds of 3/4-train / 1/4-test splits; in each round,
# 20-fold CV on the training split picks lambda.min, and per-round scores,
# coefficients, predictions, and lambdas are logged.
# NOTE(review): the SKIP branch calls rm(list = ls()) — this wipes the
# whole workspace (including anything not in the cache) before load().
SKIP <- FALSE
if (SKIP) {
rm(list = ls())
load(file = "../data/__cache__/NESTED_CV.RData")
} else {
set.seed(0)
nrounds <- 200
nfolds <- 20
#ntest <- 30
n = length(Y)
# X size = 230 x 640
nP <- ncol(X)
nO <- nrow(X)
# Training Log
# NOTE(review): Yp.train is allocated but never written below — dead code.
Yp.train <- matrix(rep(NA, nO * nO), ncol = nO) %>% as.data.frame() # Vector of zeros the size of 176 x 176
# Prediction Log
# One column per round; rows a subject did not occupy in that round's
# train/test split remain NA (hence na.rm = TRUE in the later averaging).
nested.train.Yp <- matrix(rep(NA, nO * nrounds), ncol = nrounds)
nested.train.Ypp <- matrix(rep(NA, nO * nrounds), ncol = nrounds)
nested.test.Yp <- matrix(rep(NA, nO * nrounds), ncol = nrounds)
nested.test.Ypp <- matrix(rep(NA, nO * nrounds), ncol = nrounds)
### Score Log
# NOTE(review): grown with c() once per round; if a round errors, these end
# up shorter than nrounds and the later data.frame() construction can fail.
nested.train.errorscore <- c()
nested.train.aucscore <- c()
nested.test.errorscore <- c()
nested.test.acuscore <- c()
### Coefs Log
Xcoef <- matrix(rep(NA, nP * nrounds), ncol = nrounds) # Matrix of zeros the dimensions of X (640 x 200)
### Best Lambda Log
nested.lambdas <- c()
#colnames(Xco) <- paste("s", 1:numO, sep="")
#rownames(Xco) <- paste("b", 1:numP, sep="")
for(i in 1:nrounds) {
tryCatch({
# split train/test
train = sample(1:n, 3*n/4) # by default replace=FALSE in sample()
test = (1:n)[-train]
# Per-round class-proportion weights (NOT the global maxLL-based W).
iW <- Y[train]
iW[iW == 0] <- mean(Y[train])
iW[iW == 1] <- (1-mean(Y[train]))
# NOTE(review): ilasso is fit with standardize=F while ilasso.cv uses
# standardize=T, and ilasso itself is never used afterwards — confirm
# whether this fit is needed at all.
ilasso <- glmnet(x=X[train, ], y=Y[train],
alpha=1,
weights = iW,
family = "binomial",
type.measure = "class",
standardize=F)
ilasso.cv <- cv.glmnet(x=X[train, ], y=Y[train],
alpha=1,
weights=iW,
#penalty="SCAD",
family = "binomial",
type.measure = "class",
standardize=T,
grouped=F,
nfolds=nfolds)
# define best lambda
bestlambda <- ilasso.cv$lambda.min
nested.lambdas <- c(nested.lambdas, bestlambda)
# validation data misclassification error
# NOTE(review): assessment/prediction below uses the global W subsets,
# although the models were fit with iW — confirm this is intended.
ilasso.cv.error <- assess.glmnet(ilasso.cv, X[train,], Y[train], weights = W[train], s=bestlambda, family = "binomial")$class %>% as.vector()# best lambda cv error
ilasso.cv.auc <- assess.glmnet(ilasso.cv, X[train,], Y[train], weights = W[train], s=bestlambda, family = "binomial")$auc %>% as.vector()# best lambda cv error
# training data prediction probabilities
ilasso.cv.pred <- predict(ilasso.cv, newx = X[train,], weights = W[train], s=bestlambda, type="class", family = "binomial")%>% as.numeric()
ilasso.cv.predprob <- predict(ilasso.cv, newx = X[train,], weights = W[train], s=bestlambda, type="response", family = "binomial")%>% as.numeric()
# testing data misclassification error
ilasso.test.error <-assess.glmnet(ilasso.cv, X[test,], Y[test], weights = W[test], s=bestlambda, family = "binomial")$class %>% as.vector()# best lambda cv error
ilasso.test.auc <-assess.glmnet(ilasso.cv, X[test,], Y[test], weights = W[test], s=bestlambda, family = "binomial")$auc %>% as.vector()# best lambda cv error
# training data prediction probabilities
ilasso.test.pred <- predict(ilasso.cv, newx = X[test,], weights = W[test], s=bestlambda, type="class", family = "binomial")%>% as.numeric()
ilasso.test.predprob <- predict(ilasso.cv, newx = X[test,], weights = W[test], s=bestlambda, type="response", family = "binomial")%>% as.numeric()
# coeff
B <- coef(ilasso.cv, s=bestlambda)[-1,] # do not keep intercept
### LOG Score
nested.train.errorscore <- c(nested.train.errorscore, ilasso.cv.error)
nested.train.aucscore <- c(nested.train.aucscore, ilasso.cv.auc)
nested.test.errorscore <- c(nested.test.errorscore, ilasso.test.error)
nested.test.acuscore <- c(nested.test.acuscore, ilasso.test.auc)
### LOG Coefs
Xcoef[,i] <- B
### LOG Prediction
nested.train.Yp[train,i] <- ilasso.cv.pred
nested.train.Ypp[train,i] <- ilasso.cv.predprob
nested.test.Yp[test,i] <- ilasso.test.pred
nested.test.Ypp[test,i] <- ilasso.test.predprob
# print(paste('running i =', i, 'best lambda:', round(bestlambda,4), "test.error", round(ilasso.test.error,4), "train[1] index", train[1]))
}, error=function(e){
# NOTE(review): the error itself is discarded; the round's column in the
# log matrices stays NA and the score vectors get no entry.
print(paste('i=', i, "Uhhh, error\n"))
})
}
#save.image(file = "../data/__cache__/NESTED_CV.RData")
}
We could compare the distribution of nested best vs. fit.cv$lambda.min
# Histogram of the per-round best lambdas, with reference lines at the
# regular LOO-CV lambda.min and lambda.1se.
gghistogram(nested.lambdas, color = "darkgray", fill = "darkgray", bins = 50,
title = "Nested CV Lambdas vs. Regular CV Lambda",
xlab = "Nested CV Lambdas", add = "median", rug = TRUE) +
geom_vline(aes(xintercept = fit.cv$lambda.min), col='red') +
geom_vline(aes(xintercept = fit.cv$lambda.1se), col='red') +
# NOTE(review): the "median" label is placed at mean(nested.lambdas), not
# at the median value it reports.
geom_text(aes(label=paste("median\n", round(median(nested.lambdas),4)), y=15, x=mean(nested.lambdas))) +
geom_text(aes(label=paste("min lambda\n", round(fit.cv$lambda.min,4)), y=10, x=fit.cv$lambda.min)) +
geom_text(aes(label=paste("1se lambda\n", round(fit.cv$lambda.1se,4)), y=10, x=fit.cv$lambda.1se))
The averaged Nested CV lambdas is 0.0200587, the median is 0.0122468
If we weighted by accuracy score, the weighted mean lambda is 0.0193296. Since there is no big difference between weighted mean and mean, we could simply use the mean/median directly.
Next, given the best lambda from nested CV, we could refit the LASSO model and see the model accuracy
# Refit on the full dataset at the median of the nested-CV best lambdas.
nested.lambda <- median(nested.lambdas)
# find optimal lambda using training dataset
# NOTE(review): keep, nfolds and grouped are cv.glmnet() arguments, not
# glmnet() arguments — confirm they are not silently ignored here.
nested.fit = glmnet(X, Y,
alpha = 1,
lambda = nested.lambda,
family = "binomial",
weights = W,
type.measure = "class",
standardize = T,
keep = T,
nfolds = length(Y),
grouped = T)
# LOO-CV restricted to a two-value lambda path (nested median and the
# regular lambda.min) so both can be assessed on the same CV folds.
nested.fit.cv <- cv.glmnet(y = Y,
x = X,
alpha=1,
lambda = c(nested.lambda,
fit.cv$lambda.min),
family = "binomial",
weights = W,
type.measure = "class",
standardize=T,
nfolds=length(Y),
grouped = F,
keep = T)
# Coefficient paths with the L1 norm at (approximately) the nested lambda.
# NOTE(review): the round(..., 3) match against fit$lambda can be empty if
# no path value rounds to the same 3 decimals, making L1norm 0.
plot(fit, sub="Beta Values for Connectivity")
L1norm <- sum(abs(fit$beta[,which(round(fit$lambda, 3) == round(nested.lambda, 3))]))
abline(v=L1norm, lwd=2, lty=2)
# training data prediction probabilities
# nested.fit was fit at a single lambda, so predict() needs no s argument.
nested.fit.pred <- predict(nested.fit, newx = X,
weights = W, type="class", family = "binomial")%>% as.numeric()
nested.fit.predprob <- predict(nested.fit, newx = X,
weights = W, type="response", family = "binomial")%>% as.numeric()
Accuracy
# calculate cross-validation accuracy (LOO)
# Accuracy from LOO-CV on the full data; note the * 100 percent scaling
# here, unlike the earlier fraction-scaled scores.
score <- round(1-as.vector(assess.glmnet(object = nested.fit.cv,
newx = X,
newy = Y,
weights = W,
family="binomial")$class), 4) * 100
# NOTE(review): this passes the hard class labels (nested.fit.pred), while
# the earlier prediction plots passed probabilities — confirm intended.
plot_prediction(Y, nested.fit.pred, score, "(Nested CV Refit)")
# LOO-CV AUC, displayed on the ROC plot.
score <- round(as.vector(assess.glmnet(object = nested.fit.cv,
newx = X,
newy = Y,
weights = W,
family = "binomial")$auc), 4)
plot_roc(Y, nested.fit.predprob, score, "(Nested CV Refit)")
plot_roc_slide(Y, nested.fit.predprob, "(Nested CV Refit)")
Visualize Prediction vs. Observed on Training and Testing data. The Yp is calculated by averaging across each round.
Accuracy
# Per-subject predictions averaged across the rounds in which the subject
# fell in the training (resp. testing) split; NA rounds are skipped via
# na.rm. The -1 score disables the override, so each plot computes its own
# accuracy/AUC.
plot_prediction(Y, apply(nested.train.Ypp, 1, mean, na.rm=T), -1, "(Averaged Training)")
plot_prediction(Y, apply(nested.test.Ypp, 1, mean, na.rm=T), -1, "(Averaged Testing)")
ROC
plot_roc(Y, apply(nested.train.Ypp, 1, mean, na.rm=T), -1, "(Averaged Training)")
plot_roc(Y, apply(nested.test.Ypp, 1, mean, na.rm=T), -1, "(Averaged Testing)")
ROC By Sliding Threshold
plot_roc_slide(Y, apply(nested.train.Ypp, 1, mean, na.rm=T), "(Averaged Training)")
plot_roc_slide(Y, apply(nested.test.Ypp, 1, mean, na.rm=T), "(Averaged Testing)")
### LOG Score
# Long-format score table (Accuracy and AUC, train and test, one row per
# round) for the box plots below.
# NOTE(review): if any nested-CV round errored, the score vectors are
# shorter than nrounds and these data.frame() calls will fail on length
# mismatch.
nested.df <- data.frame(score=1-nested.train.errorscore, data_type="train", score_type="Accuracy", rounds = seq_len(nrounds)) %>%
  rbind(data.frame(score=1-nested.test.errorscore, data_type="test", score_type="Accuracy", rounds = seq_len(nrounds))) %>%
  rbind(data.frame(score=nested.train.aucscore, data_type="train", score_type="AUC", rounds = seq_len(nrounds))) %>%
  rbind(data.frame(score=nested.test.acuscore, data_type="test", score_type="AUC", rounds = seq_len(nrounds)))
ggboxplot(data=nested.df, x="data_type" , y="score",
          color = "data_type", #fill = "score_type",
          facet.by = "score_type", add = "jitter") +
  # BUG FIX: the ggplot2 argument is `linetype`, not `line_type` (which is
  # ignored with a warning); linetype = 1 is the solid default, so the
  # rendered chance line is unchanged.
  geom_hline(yintercept = 0.5, col = "gray", linetype=1) +
  ggtitle("Nested CV: AUC and Accuracy for both Training and Testing data") +
  ylim(0,1) +
  theme_pander()
The left skewed distribution of Betas is a good sign that betas do not change significantly across CV Folds
Instead of thresholding betas, we decided to use nested CV to select the best lambda and refit the LASSO model. Thus, the betas were fit on the whole dataset.
betas <- nested.fit$beta %>% as.vector()
conn_betas <- as_tibble(data.frame(index=I$index, Beta=betas))
# Build the table of selected (nonzero-beta) connections, annotated with
# network labels and connection type.
# NOTE(review): separate(network, sep = "-") splits on EVERY "-", so labels
# beginning with "-1-" (the Uncertain network) lose a piece: network1 gets ""
# and network2 gets "1", with the true second network dropped. network1 is
# repaired in the mutate() below; network2 is repaired only by the hard-coded
# assignments after this pipeline -- which run AFTER connection_type has
# already been computed, so "-1--1" pairs end up labelled "Between".
connectome <- order %>%
filter(index %in% I$index) %>%
# joins on the shared `index` column (message printed by inner_join)
inner_join(conn_betas) %>%
dplyr::select(-censor2) %>%
# keep only connections Lasso retained
filter(Beta != 0) %>%
# reformat connectome: split "a b"-style connection into its two node ids
separate(connection, c("connection1", "connection2"))%>%
separate(network, sep = "-", c("network1", "network2"), remove = F) %>%
#filter(str_detect(network, pattern = "-1-")) %>%
# repair network1 for Uncertain ("-1-...") labels mangled by separate()
mutate(network1 = ifelse(str_detect(network, pattern = "-1-"), -1, network1)) %>%
# within- vs between-network flag (computed before network2 is repaired; see note)
mutate(connection_type = ifelse(network1==network2, "Within", "Between")) %>%
arrange(index)
# Repair `network2` for connections whose network label begins with "-1-"
# (the Uncertain network): separate() above split on every "-", so the label's
# leading minus sign consumed a piece and network2 was left wrong. This
# generalizes the previous six hard-coded per-label assignments ("-1-5",
# "-1-7", "-1--1", "-1-11", "-1-12", "-1-13") to any "-1-X" label.
connectome <- connectome %>%
  mutate(network2 = ifelse(grepl("^-1-", network),
                           sub("^-1-", "", network),
                           network2)) %>%
  # Recompute connection_type now that network2 is correct: the pipeline
  # above computed it before this repair, mislabelling "-1--1" (Uncertain to
  # Uncertain) pairs as "Between" instead of "Within".
  mutate(connection_type = ifelse(network1 == network2, "Within", "Between"))
# Keep a copy under the name used by the nested-CV plots below.
connectome_nested <- connectome
# Render the selected connections as a styled HTML table.
# NOTE(review): passing through xtable() before kable() looks redundant --
# kable() accepts a data frame directly; confirm nothing relies on the
# xtable class before simplifying.
connectome %>%
xtable() %>%
kable(digits = 5) %>%
kable_styling(bootstrap_options = c("striped", "hover"))
| index | network | network1 | network2 | network_names | connection1 | connection2 | censor | Beta | connection_type |
|---|---|---|---|---|---|---|---|---|---|
| 5562 | 1-1 | 1 | 1 | Sensory/somatomotor Hand-Sensory/somatomotor Hand | 18 | 22 | TRUE | -1.18264 | Within |
| 6092 | 1-1 | 1 | 1 | Sensory/somatomotor Hand-Sensory/somatomotor Hand | 20 | 24 | TRUE | -0.19964 | Within |
| 6620 | 1-1 | 1 | 1 | Sensory/somatomotor Hand-Sensory/somatomotor Hand | 20 | 26 | TRUE | -1.86371 | Within |
| 7681 | 1-1 | 1 | 1 | Sensory/somatomotor Hand-Sensory/somatomotor Hand | 25 | 30 | TRUE | -1.55529 | Within |
| 9261 | 1-1 | 1 | 1 | Sensory/somatomotor Hand-Sensory/somatomotor Hand | 21 | 36 | TRUE | -2.27400 | Within |
| 9264 | 1-1 | 1 | 1 | Sensory/somatomotor Hand-Sensory/somatomotor Hand | 24 | 36 | TRUE | -3.45859 | Within |
| 9266 | 1-1 | 1 | 1 | Sensory/somatomotor Hand-Sensory/somatomotor Hand | 26 | 36 | TRUE | 0.44346 | Within |
| 9781 | 1-1 | 1 | 1 | Sensory/somatomotor Hand-Sensory/somatomotor Hand | 13 | 38 | TRUE | 0.26553 | Within |
| 10049 | 1-1 | 1 | 1 | Sensory/somatomotor Hand-Sensory/somatomotor Hand | 17 | 39 | TRUE | 0.95880 | Within |
| 10063 | 1-1 | 1 | 1 | Sensory/somatomotor Hand-Sensory/somatomotor Hand | 31 | 39 | TRUE | 1.95909 | Within |
| 10311 | 1-1 | 1 | 1 | Sensory/somatomotor Hand-Sensory/somatomotor Hand | 15 | 40 | TRUE | 2.39523 | Within |
| 10586 | 1-1 | 1 | 1 | Sensory/somatomotor Hand-Sensory/somatomotor Hand | 26 | 41 | TRUE | -2.82731 | Within |
| 12434 | 1-3 | 1 | 3 | Sensory/somatomotor Hand-Cingulo-opercular Task Control | 26 | 48 | TRUE | 5.24795 | Between |
| 16916 | 1-4 | 1 | 4 | Sensory/somatomotor Hand-Auditory | 20 | 65 | TRUE | -1.91541 | Between |
| 16960 | 4-4 | 4 | 4 | Auditory-Auditory | 64 | 65 | TRUE | -0.33755 | Within |
| 17223 | 4-4 | 4 | 4 | Auditory-Auditory | 63 | 66 | TRUE | 0.25054 | Within |
| 17467 | 2-4 | 2 | 4 | Sensory/somatomotor Mouth-Auditory | 43 | 67 | TRUE | -0.14323 | Between |
| 17736 | 3-4 | 3 | 4 | Cingulo-opercular Task Control-Auditory | 48 | 68 | TRUE | -1.87333 | Between |
| 18017 | 4-4 | 4 | 4 | Auditory-Auditory | 65 | 69 | TRUE | 1.45966 | Within |
| 20077 | 1-5 | 1 | 5 | Sensory/somatomotor Hand-Default mode | 13 | 77 | TRUE | -2.36907 | Between |
| 21995 | -1-5 | -1 | 5 | Uncertain-Default mode | 83 | 84 | TRUE | 0.80811 | Between |
| 24368 | 5-5 | 5 | 5 | Default mode-Default mode | 80 | 93 | TRUE | 0.61002 | Within |
| 24907 | 5-5 | 5 | 5 | Default mode-Default mode | 91 | 95 | TRUE | -5.32611 | Within |
| 25166 | 5-5 | 5 | 5 | Default mode-Default mode | 86 | 96 | TRUE | -0.42336 | Within |
| 25167 | 5-5 | 5 | 5 | Default mode-Default mode | 87 | 96 | TRUE | -4.72565 | Within |
| 25424 | 5-5 | 5 | 5 | Default mode-Default mode | 80 | 97 | TRUE | -4.55376 | Within |
| 26222 | 5-5 | 5 | 5 | Default mode-Default mode | 86 | 100 | TRUE | -2.24774 | Within |
| 26497 | 5-5 | 5 | 5 | Default mode-Default mode | 97 | 101 | TRUE | -0.66724 | Within |
| 27030 | 5-5 | 5 | 5 | Default mode-Default mode | 102 | 103 | TRUE | 0.73877 | Within |
| 27823 | 5-5 | 5 | 5 | Default mode-Default mode | 103 | 106 | TRUE | -0.54149 | Within |
| 28059 | 5-5 | 5 | 5 | Default mode-Default mode | 75 | 107 | TRUE | -1.40722 | Within |
| 28619 | 5-5 | 5 | 5 | Default mode-Default mode | 107 | 109 | TRUE | -0.13522 | Within |
| 29147 | 5-5 | 5 | 5 | Default mode-Default mode | 107 | 111 | TRUE | 3.06813 | Within |
| 30208 | 5-5 | 5 | 5 | Default mode-Default mode | 112 | 115 | TRUE | 1.99039 | Within |
| 30972 | -1-5 | -1 | 5 | Uncertain-Default mode | 84 | 118 | TRUE | -0.51322 | Between |
| 31499 | 5-5 | 5 | 5 | Default mode-Default mode | 83 | 120 | TRUE | -2.72456 | Within |
| 31503 | 5-5 | 5 | 5 | Default mode-Default mode | 87 | 120 | TRUE | 2.53049 | Within |
| 34438 | 5-5 | 5 | 5 | Default mode-Default mode | 118 | 131 | TRUE | 6.54614 | Within |
| 35205 | 5-6 | 5 | 6 | Default mode-Memory retrieval? | 93 | 134 | TRUE | -7.83106 | Between |
| 35510 | 6-6 | 6 | 6 | Memory retrieval?-Memory retrieval? | 134 | 135 | TRUE | -2.33795 | Within |
| 36698 | -1--1 | -1 | -1 | Uncertain-Uncertain | 2 | 140 | TRUE | 2.86588 | Between |
| 40009 | 7-7 | 7 | 7 | Visual-Visual | 145 | 152 | TRUE | -0.79136 | Within |
| 40807 | 7-7 | 7 | 7 | Visual-Visual | 151 | 155 | TRUE | -4.82491 | Within |
| 41075 | 7-7 | 7 | 7 | Visual-Visual | 155 | 156 | TRUE | -1.27670 | Within |
| 42053 | 5-7 | 5 | 7 | Default mode-Visual | 77 | 160 | TRUE | -0.58665 | Between |
| 42927 | 7-7 | 7 | 7 | Visual-Visual | 159 | 163 | TRUE | -9.68682 | Within |
| 43423 | 5-7 | 5 | 7 | Default mode-Visual | 127 | 165 | TRUE | 5.99168 | Between |
| 44242 | 7-7 | 7 | 7 | Visual-Visual | 154 | 168 | TRUE | 0.29713 | Within |
| 44756 | -1-7 | -1 | 7 | Uncertain-Visual | 140 | 170 | TRUE | -2.28477 | Between |
| 45309 | 7-7 | 7 | 7 | Visual-Visual | 165 | 172 | TRUE | 3.53798 | Within |
| 45566 | 7-7 | 7 | 7 | Visual-Visual | 158 | 173 | TRUE | 6.67788 | Within |
| 47111 | 5-8 | 5 | 8 | Default mode-Fronto-parietal Task Control | 119 | 179 | TRUE | 1.33650 | Between |
| 47145 | 7-8 | 7 | 8 | Visual-Fronto-parietal Task Control | 153 | 179 | TRUE | 0.96341 | Between |
| 48912 | 4-8 | 4 | 8 | Auditory-Fronto-parietal Task Control | 72 | 186 | TRUE | -1.37553 | Between |
| 49015 | 8-8 | 8 | 8 | Fronto-parietal Task Control-Fronto-parietal Task Control | 175 | 186 | TRUE | -1.86039 | Within |
| 49290 | 8-8 | 8 | 8 | Fronto-parietal Task Control-Fronto-parietal Task Control | 186 | 187 | TRUE | -0.07477 | Within |
| 49555 | 8-8 | 8 | 8 | Fronto-parietal Task Control-Fronto-parietal Task Control | 187 | 188 | TRUE | -0.29958 | Within |
| 51039 | 5-8 | 5 | 8 | Default mode-Fronto-parietal Task Control | 87 | 194 | TRUE | -4.91514 | Between |
| 51142 | 8-8 | 8 | 8 | Fronto-parietal Task Control-Fronto-parietal Task Control | 190 | 194 | TRUE | -3.76537 | Within |
| 51144 | 8-8 | 8 | 8 | Fronto-parietal Task Control-Fronto-parietal Task Control | 192 | 194 | TRUE | 1.31060 | Within |
| 51350 | 6-8 | 6 | 8 | Memory retrieval?-Fronto-parietal Task Control | 134 | 195 | TRUE | -3.72428 | Between |
| 52987 | 8-8 | 8 | 8 | Fronto-parietal Task Control-Fronto-parietal Task Control | 187 | 201 | TRUE | 5.82064 | Within |
| 53356 | 1-9 | 1 | 9 | Sensory/somatomotor Hand-Salience | 28 | 203 | TRUE | 7.01097 | Between |
| 54042 | 8-9 | 8 | 9 | Fronto-parietal Task Control-Salience | 186 | 205 | TRUE | -7.84346 | Between |
| 55649 | 9-9 | 9 | 9 | Salience-Salience | 209 | 211 | TRUE | -3.74355 | Within |
| 56170 | 8-9 | 8 | 9 | Fronto-parietal Task Control-Salience | 202 | 213 | TRUE | 5.86874 | Between |
| 57239 | 9-9 | 9 | 9 | Salience-Salience | 215 | 217 | TRUE | 0.66129 | Within |
| 57502 | 9-9 | 9 | 9 | Salience-Salience | 214 | 218 | TRUE | 0.21691 | Within |
| 57770 | 9-9 | 9 | 9 | Salience-Salience | 218 | 219 | TRUE | -0.83778 | Within |
| 59359 | 10-10 | 10 | 10 | Subcortical-Subcortical | 223 | 225 | TRUE | -5.72976 | Within |
| 60155 | 10-10 | 10 | 10 | Subcortical-Subcortical | 227 | 228 | TRUE | -4.59472 | Within |
| 60235 | 2-10 | 2 | 10 | Sensory/somatomotor Mouth-Subcortical | 43 | 229 | TRUE | -9.68227 | Between |
| 61745 | 10-10 | 10 | 10 | Subcortical-Subcortical | 233 | 234 | TRUE | -9.09830 | Within |
| 62372 | 4-11 | 4 | 11 | Auditory-Ventral attention | 68 | 237 | TRUE | -1.26320 | Between |
| 62630 | 4-11 | 4 | 11 | Auditory-Ventral attention | 62 | 238 | TRUE | -3.59832 | Between |
| 63462 | 5-11 | 5 | 11 | Default mode-Ventral attention | 102 | 241 | TRUE | -1.83858 | Between |
| 63860 | 11-11 | 11 | 11 | Ventral attention-Ventral attention | 236 | 242 | TRUE | -3.67256 | Within |
| 64923 | 13-13 | 13 | 13 | Cerebellar-Cerebellar | 243 | 246 | TRUE | 1.05428 | Within |
| 67045 | -1--1 | -1 | -1 | Uncertain-Uncertain | 253 | 254 | TRUE | -0.39058 | Between |
| 67489 | 7-12 | 7 | 12 | Visual-Dorsal attention | 169 | 256 | TRUE | -1.05588 | Between |
| 67745 | 7-12 | 7 | 12 | Visual-Dorsal attention | 161 | 257 | TRUE | 1.83564 | Between |
| 68103 | 1-12 | 1 | 12 | Sensory/somatomotor Hand-Dorsal attention | 255 | 258 | TRUE | 0.69438 | Between |
| 68104 | 12-12 | 12 | 12 | Dorsal attention-Dorsal attention | 256 | 258 | TRUE | -0.82699 | Within |
| 68570 | 8-12 | 8 | 12 | Fronto-parietal Task Control-Dorsal attention | 194 | 260 | TRUE | 3.35831 | Between |
| 68818 | 8-12 | 8 | 12 | Fronto-parietal Task Control-Dorsal attention | 178 | 261 | TRUE | -2.62027 | Between |
| 69481 | 3-12 | 3 | 12 | Cingulo-opercular Task Control-Dorsal attention | 49 | 264 | TRUE | 7.01556 | Between |
# Dot chart of the Lasso beta weights, one dot per selected connection,
# colored by sign and faceted by within- vs between-network.
# NOTE(review): `sort.val` and `sorting` both appear; ggdotchart documents
# `sorting` -- presumably `sort.val` is ignored via `...`; confirm against
# the ggpubr version in use.
connectome %>%
mutate(beta_sign = ifelse(Beta >0, "+", "-")) %>%
ggdotchart(x = "network_names", y = "Beta",
color = "beta_sign", # Color by groups
palette = c("steelblue", "tomato"), # Custom color palette
rotate = TRUE,
facet.by = "connection_type",
sort.by.groups = F,
sort.val = "desc", # Sort the value in descending order
sorting = "descending", # Sort value in descending order
add = "segments", # Add segments from y = 0 to dots
add.params = list(color = "lightgray", size = 2), # Change segment color and size
group = "connection_type", # Order by groups
dot.size = 3, # Large dot size
#label = round(connectome$Beta,2), # Add mpg values as dot labels
#font.label = list(color = "white", size = 9,
# vjust = 0.5), # Adjust label parameters
#group = "cyl",
#y.text.col = TRUE,
title = paste("Lasso Connection Weights:", dim(connectome)[[1]]),
ggtheme = theme_pander()) +
# dashed reference line at zero beta
geom_hline(yintercept = 0, linetype = 2, color = "black")
And now, let’s visualize the beta weights of the connections
# Same beta-weight dot chart for the nested-CV connectome copy.
# NOTE(review): this duplicates the previous chart's options verbatim;
# consider extracting a shared plotting helper.
connectome_nested %>%
mutate(beta_sign = ifelse(Beta >0, "+", "-")) %>%
ggdotchart(x = "network_names", y = "Beta",
color = "beta_sign", # Color by groups
palette = c("steelblue", "tomato"), # Custom color palette
rotate = TRUE,
facet.by = "connection_type",
sort.by.groups = F,
sort.val = "desc", # Sort the value in descending order
sorting = "descending", # Sort value in descending order
add = "segments", # Add segments from y = 0 to dots
add.params = list(color = "lightgray", size = 2), # Change segment color and size
group = "connection_type", # Order by groups
dot.size = 3, # Large dot size
#label = round(connectome$Beta,2), # Add mpg values as dot labels
#font.label = list(color = "white", size = 9,
# vjust = 0.5), # Adjust label parameters
#group = "cyl",
#y.text.col = TRUE,
title = paste("Lasso Connection Weights(Nested):", dim(connectome_nested)[[1]]),
ggtheme = theme_pander()) +
# dashed reference line at zero beta
geom_hline(yintercept = 0, linetype = 2, color = "black")
Here, we will examine the quality of our Lasso model by doing a series of tests.
In the ablation test, we remove all the connections with significant beta values, and check whether the results are still significant.
# Ablation test: refit the Lasso using ONLY the connections whose consensus
# beta was exactly zero (i.e., with the selected connections removed), along
# the same lambda path as the full model.
XX <- X[, conn_betas$Beta == 0]
fit_wo <- glmnet(y = Y,
                 x = XX,
                 alpha = 1,            # alpha = 1 -> pure lasso penalty
                 lambda = fit$lambda,  # reuse the full model's lambda path
                 family = "binomial",
                 weights = W,
                 standardize = TRUE)
# Note: the original also passed `type.measure = "class"`, but that argument
# belongs to cv.glmnet(); glmnet() itself has no use for it, so it is dropped.
# Leave-one-out cross-validation of the ablated model along the same lambda
# path, scoring misclassification rate. grouped = FALSE is required when
# nfolds equals the sample size.
fit_wo.cv <- cv.glmnet(y = Y,
                       x = XX,
                       alpha = 1,
                       weights = W,
                       lambda = fit$lambda,
                       standardize = TRUE,
                       type.measure = "class",  # CV error = class misassignment
                       family = "binomial",
                       grouped = FALSE,
                       nfolds = length(Y))      # LOO: one fold per subject
The model does converge, but its overall classification error is much higher.
# Coefficient-path plot of the ablated fit, with a vertical line at the L1
# norm corresponding to the CV-optimal lambda.
plot(fit_wo, sub = "Beta Values for Connectivity")
# Avoid exact floating-point equality on lambda (fragile even when the same
# path was supplied to both fits): take the closest path entry instead.
best_idx <- which.min(abs(fit_wo$lambda - fit_wo.cv$lambda.min))
L1norm <- sum(abs(fit_wo$beta[, best_idx]))
abline(v = L1norm, lwd = 2, lty = 2)
It is useful to plot the two \(\lambda\)-curves (with and without the relevant connections) on the same plot.
# Collect the ablated model's CV curve, label both curves, and stack them
# for a combined lambda plot.
lasso_df_wo <- tibble(
  lambda = fit_wo.cv$lambda,
  error  = fit_wo.cv$cvm,
  sd     = fit_wo.cv$cvsd
)
lasso_df[["Model"]] <- "Full Model"
lasso_df_wo[["Model"]] <- "Without the Selected Connections"
lasso_uber <- rbind(lasso_df, lasso_df_wo)
# Overlay the CV error curves (with +/- 1 SD ribbons) of the full and
# ablated models across lambda; dashed line marks the full model's
# lambda.min.
ggplot(lasso_uber, aes(x = lambda, y = error, fill=Model)) +
#scale_color_d3() +
#scale_fill_d3()+
geom_ribbon(aes(ymin = error - sd,
ymax = error + sd),
alpha = 0.5,
#fill="blue"
) +
geom_line(aes(col=Model), lwd=2) +
xlab(expression(lambda)) +
ylab("Cross-Validation Error") +
ggtitle(expression(paste(bold("Cross Validation Error Across "), lambda))) +
# reference: optimal lambda of the FULL model's CV (fit.cv), not fit_wo.cv
geom_vline(xintercept = fit.cv$lambda.min,
linetype="dashed") +
ylim(0,1) +
theme_pander() +
theme(legend.position="bottom")
Then, we examine the Variance Inflation Factor (VIF). To calculate the VIF, we need to first create a linear model of the factor effects:
# Linear model of Y on the selected (nonzero-beta) connections, used only to
# compute variance inflation factors below. The intercept is implicit in
# `Y ~ .` (the original `+ 1` was redundant), and dfX is already a data
# frame, so the extra as.data.frame() coercion is dropped.
dfX <- data.frame(cbind(Y, X[, betas != 0]))
#dfX <- data.frame(cbind(Y, X[conn_betas[conn_betas$Beta!=0,]$index]))
mod <- lm(Y ~ ., data = dfX)
We can now calculate the VIF and turn the results into a tibble:
# Variance inflation factor for each retained predictor (car::vif), then
# wrapped in a tibble for plotting.
vifs <- vif(mod)
vifsT <- tibble(VIF = vifs)
And, finally, we can plot a histogram of the distribution of VIF values. VIF values < 10 are considered non-collinear; VIF values < 5 are great. All of our factors have VIF values that are much smaller than 5, which implies that they are as close to a normal basis set as possible.
# Histogram of the VIF distribution across all retained predictors.
ggplot(vifsT, aes(x = VIF)) +
  geom_histogram(binwidth = 0.1, col = "white", fill = "blue", alpha = 0.4) +
  labs(x = "VIF Value",
       y = "Number of Predictors",
       title = "Distribution of Variance Inflation Factors") +
  theme_pander()
Save RData
save.image(file = "../data/__cache__/worksapce_ml.RData")